Loading libraries and set directories

library(dplyr)
library(EnsDb.Hsapiens.v79)
library(EnsDb.Hsapiens.v75)
library(limma)
library(readr)
library(DESeq2)
library(EDASeq)
library(ggplot2)
library(ggrepel)
library(org.Hs.eg.db)
library(clusterProfiler)
library(enrichplot)
library(gprofiler2)
library(msigdbr)


MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
wdd="/home/mclaudia/MEDIA Project/OAStratification-pipeline/"

Retrieve gene annotation

ensdf19 <- GenomicFeatures::genes(EnsDb.Hsapiens.v75, return.type="DataFrame")
ens2gene19 <- as.data.frame(ensdf19[,c("gene_id","gene_name")])

ensdf<- GenomicFeatures::genes(EnsDb.Hsapiens.v79, return.type="DataFrame")
ens2gene <- as.data.frame(ensdf[,c("gene_id","gene_name")])

Paired Dataset from Steinberg et al.,2017 (doi:10.1038/s41598-017-09335-6): Dataset 2

Load merged Summarized Experiment data and pre-processing

se=readRDS(file = paste0(MEDIAsave,"Mydata_EGAD00001001331_merged.rds"))
colData(se)$Donor=factor(colData(se)$Donor)
colData(se)$Status=factor(colData(se)$Status)

keep <- rowSums(assay(se)>=15) >= 6   #at least 50% of the patients
se1 <- se[keep,]

tmp <-assay(se1)
tmp <-cbind(gene_id=rownames(tmp),tmp)
tmp <- merge(tmp,ens2gene19,by.y="gene_id")

Fix duplicates: mean expression value across transcripts has been used for each duplicate gene

dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp$gene_name)

tmp1 <-tmp[!tmp$gene_name %in% gn_dup,]

counts2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tmp)))
colnames(counts2) <-colnames(tmp)
rownames(counts2) <-paste0("ENS_",1:length(gn_dup))
counts2 <-counts2[,-c(1,26)]


for(i in 1:length(gn_dup)){
  x=gn_dup[i]
  y=tmp[tmp$gene_name %in% x,-c(1,26)]
  
  counts2[i,1:ncol(counts2)] <-apply(y,2,function(x) round(mean(as.numeric(x))))
}
counts2 <-cbind(gene_id=rownames(counts2),counts2,gene_name=gn_dup)
tmp3 <-rbind(tmp1,as.matrix(counts2))
dataDds <-tmp3

ccounts <-tmp3[,2:(ncol(tmp3)-1)]
ccounts <-t(apply(ccounts,1,as.numeric))
rownames(ccounts) <-tmp3$gene_id
colnames(ccounts) <-colnames(tmp3[,2:(ncol(tmp3)-1)])

dds <- DESeqDataSetFromMatrix(countData = ccounts,
                               colData = colData(se),
                               design = ~ Donor + Status)

dds$Status <- factor(dds$Status, levels = c("Normal","Osteoarthritis"))
dds$Status <- relevel(dds$Status, ref = "Normal")


keep <- rowSums(counts(dds)>=15) >= 6  #re-check low counts genes, 0 in this case
sum(keep) #19124
## [1] 19124
cat("Merged data Dimension after filtering and pre-processing: ", dim(dds))
## Merged data Dimension after filtering and pre-processing:  19124 24

Principal component analysis

vsd <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("Donor", "Status"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=Donor, shape=Status,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+
  ggtitle("All samples after merging replicate Runs")+
  theme_bw()

Batch correction

assay(vsd)<-limma::removeBatchEffect(assay(vsd), vsd$Donor)

pcaData <- plotPCA(vsd, intgroup=c("Donor", "Status"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=Donor, shape=Status,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+
  ggtitle("All samples after merging replicate Runs")+
  theme_bw()

Differential expression analysis using DESeq2

dds <- DESeq(dds)
resultsNames(dds)
##  [1] "Intercept"                       "Donor_SMBP051_vs_SMBP049"       
##  [3] "Donor_SMBP053_vs_SMBP049"        "Donor_SMBP054_vs_SMBP049"       
##  [5] "Donor_SMBP055_vs_SMBP049"        "Donor_SMBP056_vs_SMBP049"       
##  [7] "Donor_SMBP059_vs_SMBP049"        "Donor_SMBP060_vs_SMBP049"       
##  [9] "Donor_SMBP061_vs_SMBP049"        "Donor_SMBP064_vs_SMBP049"       
## [11] "Donor_SMBP065_vs_SMBP049"        "Donor_SMBP066_vs_SMBP049"       
## [13] "Status_Osteoarthritis_vs_Normal"
df.res <- as.data.frame(results(dds,name="Status_Osteoarthritis_vs_Normal" ,pAdjustMethod = "fdr"))
summary(df.res)
##     baseMean       log2FoldChange          lfcSE              stat         
##  Min.   :      7   Min.   :-2.052493   Min.   :0.03297   Min.   :-5.82675  
##  1st Qu.:     62   1st Qu.:-0.116090   1st Qu.:0.10228   1st Qu.:-0.90492  
##  Median :    318   Median : 0.000866   Median :0.13994   Median : 0.00505  
##  Mean   :   2015   Mean   : 0.037667   Mean   :0.16481   Mean   : 0.07884  
##  3rd Qu.:   1122   3rd Qu.: 0.142020   3rd Qu.:0.20596   3rd Qu.: 0.98097  
##  Max.   :3442514   Max.   : 2.257059   Max.   :0.97056   Max.   : 6.86018  
##      pvalue            padj          
##  Min.   :0.0000   Min.   :0.0000001  
##  1st Qu.:0.1191   1st Qu.:0.4759139  
##  Median :0.3460   Median :0.6916913  
##  Mean   :0.3981   Mean   :0.6523361  
##  3rd Qu.:0.6530   3rd Qu.:0.8705838  
##  Max.   :0.9999   Max.   :0.9999275
df.res$ens <-rownames(df.res)
dataDds <-dataDds[,c(1,26)]
colnames(dataDds) <-c("ens","gene_id")

df.res<-merge(dataDds,df.res,by.x="ens")
df.res <-df.res[,c(1,3:8,2)]
colnames(df.res)[8] <-"hgnc_symbol"
rownames(df.res) <-df.res$ens
rm(dataDds,tmp3)

Save files

save(df.res, file=paste0(MEDIAsave,"results_DE_EGApaired.RData"))

Volcano Plot of DE genes

annot <-df.res[,c(3,7,8)]
annot <-annot[complete.cases(annot),]
annot$col <-"gray50"
annot[annot$log2FoldChange <= -1 & annot$padj <=0.05, "col"] <- "springgreen3"
annot[annot$log2FoldChange  >= 1 & annot$padj <=0.05, "col"] <- "red"

annot$label <- NA
annot[annot$log2FoldChange  >= 1.5 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange  >= 1.5 & annot$padj <=0.05, "hgnc_symbol"]
annot[annot$log2FoldChange  <= -1.5 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange   <= -1.5 & annot$padj <=0.05, "hgnc_symbol"]

ggplot(data=annot, aes(x=log2FoldChange, y=-log10(padj), color=col,label=label)) +
  geom_point(size=1) +
  geom_label_repel(size=4,
                   min.segment.length = 0,direction = "both",
                   max.overlaps =40,
                   color="black",
                   segment.linetype=2) +
  scale_shape_manual(values=c(8, 19))+
  scale_color_manual(values = c("red","gray50","springgreen3"),
                     breaks = c("red","gray50","springgreen3"),
                     labels = c("UP","notDE","DOWN")) +
  labs(color="Legend")+
  xlab("log2(FC)")+
  ylab("-log10(FDR)")+
  theme_bw()+
  ggtitle("Volcano plot of DE genes - Dataset 2")+
geom_vline(xintercept=c(-1, 1), col="gray", linetype=2)+
  geom_hline(yintercept=-log10(0.05), col="gray", linetype=2)

Enrichment with GSEA: BP and REACTOME

hs=get("org.Hs.eg.db")

genes1 <- df.res[, c("log2FoldChange","hgnc_symbol")]
genes <- genes1$log2FoldChange
names(genes) <-genes1$hgnc_symbol

genes <-sort(genes, decreasing = TRUE)

gsebp <- gseGO(geneList=genes, 
               ont ="BP", 
               keyType = "SYMBOL", 
               pvalueCutoff = 0.01,
               eps = 0,
               verbose = TRUE, 
               OrgDb = hs, 
               pAdjustMethod = "fdr")
dotplot(gsebp, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)

rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)

gsreac <- clusterProfiler::GSEA(genes,TERM2GENE = sig,
                             eps = 0,
                             verbose = TRUE, 
                             minGSSize =5, pAdjustMethod = "fdr",
                             pvalueCutoff =0.1)
dotplot(gsreac, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)+
  theme(axis.text.y = element_text(size = 7))

Unpaired Dataset from Soul et al.,2017(doi:10.1136/annrheumdis-2017-212603): Dataset 3

Load data and pre-processing

load(paste0(wdd,"data/txi.RData"))
patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)

txi$counts <- txi$counts[,match(unique(as.character(patientDetails$name)),colnames(txi$counts))]
txi$abundance <- txi$abundance[,match(unique(as.character(patientDetails$name)),colnames(txi$abundance))]
txi$length <- txi$length[,match(unique(as.character(patientDetails$name)),colnames(txi$length))]

patientDetails <-patientDetails[,-3]
patientDetails<-patientDetails[!duplicated(patientDetails$name),]
patientDetails$OA<-ifelse(grepl("SH0", patientDetails$name),"OA","NonOA")

Batches<-as.factor(patientDetails$batch)
Disease<- as.factor(patientDetails$OA)

colData<-data.frame(Batches=Batches,Disease)
dds<-DESeqDataSetFromTximport(txi,colData,~Batches+Disease)

filter <- rowSums(counts(dds)>=5) >= 35  #at least 50% of the samples
dds <- dds[filter,]

tmp <-counts(dds)
tmp <-cbind(gene_id=rownames(tmp),tmp)
tmp <-merge(tmp,ens2gene,by.y="gene_id")

Fix duplicates: mean expression value across transcripts has been used for each duplicate gene

dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp$gene_name)

tmp1 <-tmp[!tmp$gene_name %in% gn_dup,]

counts2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tmp)))
colnames(counts2) <-colnames(tmp)
rownames(counts2) <-paste0("ENS_",1:length(gn_dup))
counts2 <-counts2[,-c(1,72)]

for(i in 1:length(gn_dup)){
  x=gn_dup[i]
  y=tmp[tmp$gene_name %in% x,-c(1,72)]
  
  counts2[i,1:ncol(counts2)] <-apply(y,2,function(x) round(mean(as.numeric(x))))
}
counts2 <-cbind(gene_id=rownames(counts2),counts2,gene_name=gn_dup)
tmp3 <-rbind(tmp1,as.matrix(counts2))

ccounts <-tmp3[,2:(ncol(tmp3)-1)]
ccounts <-t(apply(ccounts,1,as.numeric))
rownames(ccounts) <-tmp3$gene_id
colnames(ccounts) <-colnames(tmp3[,2:(ncol(tmp3)-1)])

dds0 <- DESeqDataSetFromMatrix(countData = ccounts,
                               colData = colData,
                               design = ~Batches+Disease)

filter <- rowSums(counts(dds0)>=5) >= 35  #re-check low counts genes
dds0 <- dds0[filter,]

Principal component analysis

vsd <- vst(dds0, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("Batches", "Disease"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=Batches, shape=Disease,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+
  ggtitle("All samples after merging replicate Runs")+
  theme_bw()

Batch correction

assay(vsd)<-limma::removeBatchEffect(assay(vsd), vsd$Batches)

pcaData <- plotPCA(vsd, intgroup=c("Batches", "Disease"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=Batches, shape=Disease,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+
  ggtitle("All samples after merging replicate Runs")+
  theme_bw()

Differential expression analysis using DESeq2

dds2<-DESeq(dds0)
resultsNames(dds2)
##  [1] "Intercept"           "Batches_2_vs_1"      "Batches_3_vs_1"     
##  [4] "Batches_4_vs_1"      "Batches_5_vs_1"      "Batches_6_vs_1"     
##  [7] "Batches_7_vs_1"      "Batches_8_vs_1"      "Batches_9_vs_1"     
## [10] "Batches_10_vs_1"     "Batches_11_vs_1"     "Batches_12_vs_1"    
## [13] "Batches_13_vs_1"     "Disease_OA_vs_NonOA"
OAvsnonOA<-as.data.frame(results(dds2,name="Disease_OA_vs_NonOA",pAdjustMethod = "fdr"))

OAvsnonOA_dup <-OAvsnonOA[grep("ENS_",rownames(OAvsnonOA)),]
names(gn_dup) <-counts2$gene_id

setdiff(names(gn_dup),rownames(OAvsnonOA_dup)) ## check if some corrected duplicates did not exceed the minimum of counts for at least half of the patients
## [1] "ENS_202"
#[1] "ENS_202"

which(names(gn_dup) %in% "ENS_202")
## [1] 202
# 202 
OAvsnonOA_dup <-OAvsnonOA_dup[names(gn_dup[-202]),]
OAvsnonOA_dup$hgnc_symbol <-gn_dup[-202]
OAvsnonOA_dup <-cbind(Row.names=rownames(OAvsnonOA_dup), OAvsnonOA_dup)

OAvsnonOA<-merge(OAvsnonOA,ens2gene,by.x="row.names",by.y="gene_id")
colnames(OAvsnonOA)[8] <-"hgnc_symbol"

OAvsnonOA_dup <-OAvsnonOA_dup[,colnames(OAvsnonOA)]
OAvsnonOA <-rbind(OAvsnonOA, OAvsnonOA_dup)
OAvsnonOA<-OAvsnonOA[order(OAvsnonOA$log2FoldChange,decreasing=TRUE),]

Save files

OAvsnonOA <-OAvsnonOA[complete.cases(OAvsnonOA),]

save(OAvsnonOA, dds0, OAvsnonOA_dup, gn_dup,file=paste0(MEDIAsave,"genide-bulk-unpaired.RData"))
save(OAvsnonOA,file=paste0(MEDIAsave,"genide-bulk-unpaired_DE.RData"))

Volcano Plot of DE genes

annot <-OAvsnonOA[,c(3,7,8)]
annot <-annot[complete.cases(annot),]
annot$col <-"gray50"
annot[annot$log2FoldChange <= -1.5 & annot$padj <=0.05, "col"] <- "springgreen3"
annot[annot$log2FoldChange  >= 1.5 & annot$padj <=0.05, "col"] <- "red"

annot$label <- NA
annot[annot$log2FoldChange  >= 3 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange  >= 3 & annot$padj <=0.05, "hgnc_symbol"]
annot[annot$log2FoldChange  <= -3 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange   <= -3 & annot$padj <=0.05, "hgnc_symbol"]

ggplot(data=annot, aes(x=log2FoldChange, y=-log10(padj), color=col,label=label)) +
  geom_point(size=1) +
  geom_label_repel(size=4,
                   min.segment.length = 0,direction = "both",
                   max.overlaps =40,
                   color="black",
                   segment.linetype=2) +
  scale_shape_manual(values=c(8, 19))+
  scale_color_manual(values = c("red","gray50","springgreen3"),
                     breaks = c("red","gray50","springgreen3"),
                     labels = c("UP","notDE","DOWN")) +
  labs(color="Legend")+
  xlab("log2(FC)")+
  ylab("-log10(FDR)")+
  ggtitle("Volcano plot of DE genes - Dataset 3")+
  theme_bw()+
geom_vline(xintercept=c(-2.5, 2.5), col="gray", linetype=2)+
  geom_hline(yintercept=-log10(0.05), col="gray", linetype=2)

Enrichment with GSEA: BP and REACTOME

gene <-OAvsnonOA[order(OAvsnonOA$log2FoldChange, decreasing = TRUE),]
genes <-gene$log2FoldChange
names(genes) <-gene$hgnc_symbol
hs=get("org.Hs.eg.db")

gsebp2 <- gseGO(geneList=genes, 
               ont ="BP", 
               keyType = "SYMBOL", 
               pvalueCutoff = 0.01,
               eps = 0,
               verbose = TRUE, 
               OrgDb = hs, 
               pAdjustMethod = "fdr")

dotplot(gsebp2, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)

rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)

gsreac2 <- clusterProfiler::GSEA(genes,TERM2GENE = sig,
                             eps = 0,
                             verbose = TRUE, 
                             minGSSize =5, pAdjustMethod = "fdr",
                             pvalueCutoff =0.1)
dotplot(gsreac2, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)+
  theme(axis.text.y = element_text(size = 7))